Load data

Here, we load the surprisal data for the modified stimuli from Bradford et al. (2020), as computed by each of the models (BERT and GPT-3).

Original stimuli were obtained from OSF: https://osf.io/pw7h6/.

BERT

# setwd("/Users/seantrott/Dropbox/UCSD/Research/NLMs/nlm-fb/src/analysis")

# Read the BERT-large surprisal table for the modified stimuli.
df_fb_bl <- read_csv(
  "../../data/processed/bradford-fb-modified_bert-large_surprisals.csv"
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   `Unnamed: 0` = col_double(),
##   Item = col_double(),
##   Condition = col_double(),
##   `Condition code` = col_character(),
##   Scenario = col_character(),
##   Modified = col_character(),
##   First_mention = col_character(),
##   Recent_mention = col_character(),
##   prior_mentions_start = col_double(),
##   prior_mentions_end = col_double(),
##   num_sentences = col_double(),
##   probability = col_double(),
##   belief = col_character(),
##   consistency = col_character(),
##   final_word = col_character(),
##   masked_passages = col_character()
## )
# Row count of the loaded BERT table (captured output follows).
nrow(df_fb_bl)
## [1] 2400
# Derive the BERT analysis columns: `condition` is an alias for
# `Condition code`, `log_prob` is the base-2 log of `probability`, and
# `surprisal` is the negated log probability.
df_fb_bl <- mutate(
  df_fb_bl,
  condition = `Condition code`,
  log_prob = log2(probability),
  surprisal = -log_prob
)

# Cross-tabulate condition against First_mention.
table(df_fb_bl[["condition"]], df_fb_bl[["First_mention"]])
##        
##         End Start
##   FB-C  120   480
##   FB-IC 120   480
##   TB-C  120   480
##   TB-IC 120   480
# Cross-tabulate condition against consistency.
table(df_fb_bl[["condition"]], df_fb_bl[["consistency"]])
##        
##           C  IC
##   FB-C  600   0
##   FB-IC   0 600
##   TB-C  600   0
##   TB-IC   0 600
# Cross-tabulate condition against belief.
table(df_fb_bl[["condition"]], df_fb_bl[["belief"]])
##        
##          FB  TB
##   FB-C  600   0
##   FB-IC 600   0
##   TB-C    0 600
##   TB-IC   0 600
# Cross-tabulate consistency against belief.
table(df_fb_bl[["consistency"]], df_fb_bl[["belief"]])
##     
##       FB  TB
##   C  600 600
##   IC 600 600
# Ratio of prior mentions of the start location to prior mentions of the end
# location column. Rows with prior_mentions_end == 0 would yield Inf or NaN.
df_fb_bl <- mutate(
  df_fb_bl,
  mentions_ratio = prior_mentions_start / prior_mentions_end
)

GPT-3

# Read the GPT-3 surprisal table for the modified stimuli.
df_fb_gpt3 <- read_csv(
  "../../data/processed/bradford-fb-modified_gpt3_surprisals.csv"
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   `Unnamed: 0` = col_double(),
##   Item = col_double(),
##   Condition = col_double(),
##   `Condition code` = col_character(),
##   Scenario = col_character(),
##   Modified = col_character(),
##   First_mention = col_character(),
##   Recent_mention = col_character(),
##   prior_mentions_start = col_double(),
##   prior_mentions_end = col_double(),
##   num_sentences = col_double(),
##   log_prob = col_double(),
##   belief = col_character(),
##   consistency = col_character()
## )
# Row count of the loaded GPT-3 table (captured output follows).
nrow(df_fb_gpt3)
## [1] 2400
# Derive the GPT-3 analysis columns. The file already provides `log_prob`,
# so `surprisal` is simply its negation.
# NOTE(review): the BERT table builds log_prob with log2(); confirm that the
# GPT-3 log_prob column uses the same base before comparing magnitudes.
df_fb_gpt3 <- mutate(
  df_fb_gpt3,
  condition = `Condition code`,
  surprisal = -log_prob
)

# Row counts per condition.
table(df_fb_gpt3[["condition"]])
## 
##  FB-C FB-IC  TB-C TB-IC 
##   600   600   600   600
# Cross-tabulate condition against consistency.
table(df_fb_gpt3[["condition"]], df_fb_gpt3[["consistency"]])
##        
##           C  IC
##   FB-C  600   0
##   FB-IC   0 600
##   TB-C  600   0
##   TB-IC   0 600
# Cross-tabulate condition against belief.
table(df_fb_gpt3[["condition"]], df_fb_gpt3[["belief"]])
##        
##          FB  TB
##   FB-C  600   0
##   FB-IC 600   0
##   TB-C    0 600
##   TB-IC   0 600
# Cross-tabulate consistency against belief.
table(df_fb_gpt3[["consistency"]], df_fb_gpt3[["belief"]])
##     
##       FB  TB
##   C  600 600
##   IC 600 600
# Ratio of the prior_mentions_start column to the prior_mentions_end column.
# Rows with prior_mentions_end == 0 would yield Inf or NaN.
df_fb_gpt3 <- mutate(
  df_fb_gpt3,
  mentions_ratio = prior_mentions_start / prior_mentions_end
)

Visualization

BERT

## Density version
# Ridgeline densities of BERT surprisal for rows with Modified == "Yes":
# one ridge per belief level, filled by consistency, and one panel per
# First_mention x Recent_mention combination.
df_fb_bl %>%
  filter(Modified == "Yes") %>%
  ggplot(aes(x = surprisal,
             y = belief,
             fill = consistency)) +
  # after_stat(density) replaces the deprecated `..density..` notation.
  geom_density_ridges2(aes(height = after_stat(density)),
                       color = gray(0.25),
                       alpha = 0.5,
                       scale = 0.85,
                       size = 0.9,
                       stat = "density") +
  labs(x = "Surprisal of target word",
       y = "Belief condition") +
  # Dotted reference line at zero surprisal.
  geom_vline(xintercept = 0, linetype = "dotted") +
  theme_bw() +
  facet_wrap(~First_mention + Recent_mention,
             labeller = label_both)

## Strip chart version
# Jittered BERT surprisal values by belief, colored by consistency, with the
# group mean and a +/- 2 standard-error interval overlaid. One panel per
# Modified x First_mention combination.
df_fb_bl %>%
  ggplot(aes(x = belief,
             y = surprisal,
             color = consistency)) +
  geom_jitter(alpha = 0.1) +
  # mean_se(mult = 2) gives mean +/- 2 * sd / sqrt(n), the same interval the
  # hand-written summary functions computed.
  stat_summary(fun.data = mean_se,
               fun.args = list(mult = 2),
               geom = "pointrange",
               position = position_dodge(width = 0.95)) +
  # The x aesthetic is `belief` (FB/TB), not the four-level `condition`
  # column, so the axis is labelled to match the density plot.
  labs(x = "Belief condition",
       y = "Surprisal (BERT)") +
  theme_bw() +
  # label_both prefixes each strip with its variable name, matching the other
  # faceted plots in this file.
  facet_wrap(~Modified + First_mention,
             labeller = label_both)

# BERT surprisal by the four-level condition, colored by First_mention:
# jittered raw values plus the group mean with a +/- 2 standard-error range.
ggplot(df_fb_bl,
       aes(x = condition, y = surprisal, color = First_mention)) +
  geom_jitter(alpha = 0.1) +
  # mean_se(mult = 2) returns mean +/- 2 * sd / sqrt(n).
  stat_summary(fun.data = mean_se,
               fun.args = list(mult = 2),
               geom = "pointrange",
               position = position_dodge(width = 0.95)) +
  labs(x = "Condition", y = "Surprisal (BERT)") +
  theme_bw()

GPT-3

## Density version
# Ridgeline densities of GPT-3 surprisal for rows with Modified == "Yes":
# one ridge per belief level, filled by consistency, and one panel per
# First_mention x Recent_mention combination.
df_fb_gpt3 %>%
  filter(Modified == "Yes") %>%
  ggplot(aes(x = surprisal,
             y = belief,
             fill = consistency)) +
  # after_stat(density) replaces the deprecated `..density..` notation.
  geom_density_ridges2(aes(height = after_stat(density)),
                       color = gray(0.25),
                       alpha = 0.5,
                       scale = 0.85,
                       size = 0.9,
                       stat = "density") +
  labs(x = "Surprisal of target word",
       y = "Belief condition") +
  # Dotted reference line at zero surprisal.
  geom_vline(xintercept = 0, linetype = "dotted") +
  theme_bw() +
  facet_wrap(~First_mention +
               Recent_mention,
             labeller = label_both)

## Strip chart version
# Jittered GPT-3 surprisal values by belief, colored by consistency, with the
# group mean and a +/- 2 standard-error interval overlaid. Panels are laid out
# in two columns, one per First_mention x Recent_mention x mentions_ratio
# combination.
df_fb_gpt3 %>%
  ggplot(aes(x = belief,
             y = surprisal,
             color = consistency)) +
  geom_jitter(alpha = 0.1) +
  # mean_se(mult = 2) gives mean +/- 2 * sd / sqrt(n), the same interval the
  # hand-written summary functions computed.
  stat_summary(fun.data = mean_se,
               fun.args = list(mult = 2),
               geom = "pointrange",
               position = position_dodge(width = 0.95)) +
  # The x aesthetic is `belief` (FB/TB), not the four-level `condition`
  # column, so the axis is labelled to match the density plot.
  labs(x = "Belief condition",
       y = "Surprisal (GPT-3)") +
  theme_bw() +
  facet_wrap(~First_mention + Recent_mention + mentions_ratio,
             labeller = label_both,
             ncol = 2)

# Original condition
# GPT-3 surprisal for each of the four condition codes: jittered raw values
# plus the group mean with a +/- 2 standard-error range. The dotted line marks
# zero surprisal.
ggplot(df_fb_gpt3,
       aes(x = condition, y = surprisal, color = condition)) +
  geom_jitter(alpha = 0.1) +
  # mean_se(mult = 2) returns mean +/- 2 * sd / sqrt(n).
  stat_summary(fun.data = mean_se,
               fun.args = list(mult = 2),
               geom = "pointrange",
               position = position_dodge(width = 0.95)) +
  labs(x = "Condition", y = "Surprisal of target word") +
  geom_hline(yintercept = 0, linetype = "dotted") +
  theme_bw()

## By first mention
# GPT-3 surprisal by condition, colored by First_mention and split into one
# panel per Recent_mention level. Points are jittered raw values; the
# pointrange is the group mean with a +/- 2 standard-error range.
ggplot(df_fb_gpt3,
       aes(x = condition, y = surprisal, color = First_mention)) +
  geom_jitter(alpha = 0.1) +
  # mean_se(mult = 2) returns mean +/- 2 * sd / sqrt(n).
  stat_summary(fun.data = mean_se,
               fun.args = list(mult = 2),
               geom = "pointrange",
               position = position_dodge(width = 0.95)) +
  labs(x = "Condition", y = "Surprisal (GPT-3)") +
  geom_hline(yintercept = 0, linetype = "dotted") +
  theme_bw() +
  facet_wrap(~Recent_mention, labeller = label_both)

Analysis

BERT

# Full BERT model: belief x consistency interaction plus the mention
# covariates, with by-Item random intercepts and random slopes for the
# interaction. Fit by maximum likelihood (REML = FALSE); the nested models
# below are compared against it with anova().
# NOTE: the recorded run reports a singular fit for this model.
model_full <- lmer(
  surprisal ~ belief * consistency +
    First_mention + Recent_mention + mentions_ratio +
    (1 + belief * consistency | Item),
  data = df_fb_bl,
  REML = FALSE,
  control = lmerControl(optimizer = "bobyqa")
)
## boundary (singular) fit: see ?isSingular
# Additive BERT model: same as the full model but without the fixed
# belief x consistency interaction (random-effects structure unchanged).
# NOTE: the recorded run reports a singular fit for this model.
model_just_fe <- lmer(
  surprisal ~ belief + consistency +
    First_mention + Recent_mention + mentions_ratio +
    (1 + belief * consistency | Item),
  data = df_fb_bl,
  REML = FALSE,
  control = lmerControl(optimizer = "bobyqa")
)
## boundary (singular) fit: see ?isSingular
# BERT model without the fixed effect of belief (random-effects structure
# unchanged); used as the reduced model when testing belief.
# NOTE: the recorded run reports a singular fit for this model.
model_no_belief <- lmer(
  surprisal ~ consistency +
    First_mention + Recent_mention + mentions_ratio +
    (1 + belief * consistency | Item),
  data = df_fb_bl,
  REML = FALSE,
  control = lmerControl(optimizer = "bobyqa")
)
## boundary (singular) fit: see ?isSingular
# BERT model without the fixed effect of consistency (random-effects
# structure unchanged); used as the reduced model when testing consistency.
# NOTE: the recorded run reports a singular fit for this model.
model_no_con <- lmer(
  surprisal ~ belief +
    First_mention + Recent_mention + mentions_ratio +
    (1 + belief * consistency | Item),
  data = df_fb_bl,
  REML = FALSE,
  control = lmerControl(optimizer = "bobyqa")
)
## boundary (singular) fit: see ?isSingular
# Print the fitted full BERT model (captured output follows).
summary(model_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: surprisal ~ belief * consistency + First_mention + Recent_mention +  
##     mentions_ratio + (1 + belief * consistency | Item)
##    Data: df_fb_bl
## Control: lmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##  11859.6  11963.7  -5911.8  11823.6     2382 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2430 -0.5066 -0.0373  0.5032  4.8401 
## 
## Random effects:
##  Groups   Name                   Variance Std.Dev. Corr             
##  Item     (Intercept)             21.223   4.607                    
##           beliefTB                58.554   7.652   -0.78            
##           consistencyIC           61.153   7.820   -0.77  1.00      
##           beliefTB:consistencyIC 232.682  15.254    0.77 -1.00 -1.00
##  Residual                          5.563   2.359                    
## Number of obs: 2400, groups:  Item, 120
## 
## Fixed effects:
##                          Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)               4.87157    0.44485  135.67961  10.951   <2e-16 ***
## beliefTB                  1.49304    0.71168  120.06469   2.098   0.0380 *  
## consistencyIC             1.16954    0.72674  120.00084   1.609   0.1102    
## First_mentionStart        0.34121    0.14095 2040.00061   2.421   0.0156 *  
## Recent_mentionStart       0.24889    0.14950 2040.00047   1.665   0.0961 .  
## mentions_ratio           -0.22114    0.09967 2040.00039  -2.219   0.0266 *  
## beliefTB:consistencyIC   -3.35726    1.40574  120.00068  -2.388   0.0185 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) belfTB cnssIC Frst_S Rcnt_S mntns_
## beliefTB    -0.750                                   
## consstncyIC -0.745  0.982                            
## Frst_mntnSt -0.132  0.000  0.000                     
## Rcnt_mntnSt  0.062  0.000  0.000  0.079              
## mentions_rt -0.112  0.000  0.000 -0.471 -0.556       
## blfTB:cnsIC  0.736 -0.991 -0.991  0.000  0.000  0.000
## convergence code: 0
## boundary (singular) fit: see ?isSingular
# Test the belief x consistency interaction (BERT): compare the full model
# against the additive model.
anova(model_full, model_just_fe)
## Data: df_fb_bl
## Models:
## model_just_fe: surprisal ~ belief + consistency + First_mention + Recent_mention + 
## model_just_fe:     mentions_ratio + (1 + belief * consistency | Item)
## model_full: surprisal ~ belief * consistency + First_mention + Recent_mention + 
## model_full:     mentions_ratio + (1 + belief * consistency | Item)
##               npar   AIC   BIC  logLik deviance  Chisq Df Pr(>Chisq)  
## model_just_fe   17 11863 11962 -5914.6    11829                       
## model_full      18 11860 11964 -5911.8    11824 5.5723  1    0.01825 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test the fixed effect of belief (BERT): compare the additive model against
# the model without belief.
anova(model_just_fe, model_no_belief)
## Data: df_fb_bl
## Models:
## model_no_belief: surprisal ~ consistency + First_mention + Recent_mention + mentions_ratio + 
## model_no_belief:     (1 + belief * consistency | Item)
## model_just_fe: surprisal ~ belief + consistency + First_mention + Recent_mention + 
## model_just_fe:     mentions_ratio + (1 + belief * consistency | Item)
##                 npar   AIC   BIC  logLik deviance Chisq Df Pr(>Chisq)  
## model_no_belief   16 11865 11957 -5916.4    11833                      
## model_just_fe     17 11863 11962 -5914.6    11829 3.728  1    0.05351 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test the fixed effect of consistency (BERT): compare the additive model
# against the model without consistency.
anova(model_just_fe, model_no_con)
## Data: df_fb_bl
## Models:
## model_no_con: surprisal ~ belief + First_mention + Recent_mention + mentions_ratio + 
## model_no_con:     (1 + belief * consistency | Item)
## model_just_fe: surprisal ~ belief + consistency + First_mention + Recent_mention + 
## model_just_fe:     mentions_ratio + (1 + belief * consistency | Item)
##               npar   AIC   BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## model_no_con    16 11888 11980 -5928.0    11856                         
## model_just_fe   17 11863 11962 -5914.6    11829 26.783  1  2.276e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tidy the full BERT model into a data frame of term-level estimates.
df_tidy_results <- broom.mixed::tidy(model_full)

# Coefficient plot of the fixed effects: point estimates with +/- 2
# standard-error bars, terms on the vertical axis, dotted line at zero.
ggplot(filter(df_tidy_results, effect == "fixed"),
       aes(x = term, y = estimate)) +
  geom_point() +
  coord_flip() +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_errorbar(
    aes(ymin = estimate - 2 * std.error,
        ymax = estimate + 2 * std.error),
    width = 0.2,
    position = position_dodge(0.9)
  ) +
  theme_minimal()

GPT-3

# Full GPT-3 model: belief x consistency interaction plus the mention
# covariates, with by-Item random intercepts and random slopes for the
# interaction, fit by maximum likelihood. Reuses the name `model_full`,
# replacing the BERT fit of the same name.
# NOTE: the recorded run reports a singular fit and a convergence warning
# (negative eigenvalue) for this model.
model_full <- lmer(
  surprisal ~ belief * consistency +
    First_mention + Recent_mention + mentions_ratio +
    (1 + belief * consistency | Item),
  data = df_fb_gpt3,
  REML = FALSE,
  control = lmerControl(optimizer = "bobyqa")
)
## boundary (singular) fit: see ?isSingular
## Warning: Model failed to converge with 1 negative eigenvalue: -2.1e+01
# Additive GPT-3 model: same as the full model but without the fixed
# belief x consistency interaction (random-effects structure unchanged).
# NOTE: the recorded run reports a singular fit and a convergence warning
# (negative eigenvalue) for this model.
model_just_fe <- lmer(
  surprisal ~ belief + consistency +
    First_mention + Recent_mention + mentions_ratio +
    (1 + belief * consistency | Item),
  data = df_fb_gpt3,
  REML = FALSE,
  control = lmerControl(optimizer = "bobyqa")
)
## boundary (singular) fit: see ?isSingular
## Warning: Model failed to converge with 1 negative eigenvalue: -5.4e+00
# GPT-3 model without the fixed effect of belief (random-effects structure
# unchanged); used as the reduced model when testing belief.
model_no_belief <- lmer(
  surprisal ~ consistency +
    First_mention + Recent_mention + mentions_ratio +
    (1 + belief * consistency | Item),
  data = df_fb_gpt3,
  REML = FALSE,
  control = lmerControl(optimizer = "bobyqa")
)

# GPT-3 model without the fixed effect of consistency (random-effects
# structure unchanged); used as the reduced model when testing consistency.
model_no_con <- lmer(
  surprisal ~ belief +
    First_mention + Recent_mention + mentions_ratio +
    (1 + belief * consistency | Item),
  data = df_fb_gpt3,
  REML = FALSE,
  control = lmerControl(optimizer = "bobyqa")
)

# Print the fitted full GPT-3 model (captured output follows).
summary(model_full)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: surprisal ~ belief * consistency + First_mention + Recent_mention +  
##     mentions_ratio + (1 + belief * consistency | Item)
##    Data: df_fb_gpt3
## Control: lmerControl(optimizer = "bobyqa")
## 
##      AIC      BIC   logLik deviance df.resid 
##   4771.9   4875.9  -2367.9   4735.9     2382 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.4440 -0.4745  0.0057  0.4309  8.7051 
## 
## Random effects:
##  Groups   Name                   Variance Std.Dev. Corr             
##  Item     (Intercept)             2.2363  1.4954                    
##           beliefTB                3.8264  1.9561   -0.76            
##           consistencyIC           4.0018  2.0004   -0.74  1.00      
##           beliefTB:consistencyIC 15.3332  3.9158    0.75 -1.00 -1.00
##  Residual                         0.2705  0.5201                    
## Number of obs: 2400, groups:  Item, 120
## 
## Fixed effects:
##                          Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)               2.27642    0.14021  127.28760  16.236  < 2e-16 ***
## beliefTB                 -0.80112    0.18108  119.99933  -4.424 2.14e-05 ***
## consistencyIC            -0.68670    0.18507  120.00804  -3.711 0.000315 ***
## First_mentionStart        0.20485    0.03108 2039.68255   6.591 5.55e-11 ***
## Recent_mentionStart      -0.09216    0.03297 2039.68251  -2.796 0.005230 ** 
## mentions_ratio           -0.15575    0.02198 2039.68254  -7.087 1.88e-12 ***
## beliefTB:consistencyIC    1.43933    0.35997  119.99971   3.998 0.000111 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) belfTB cnssIC Frst_S Rcnt_S mntns_
## beliefTB    -0.750                                   
## consstncyIC -0.733  0.986                            
## Frst_mntnSt -0.092  0.000  0.000                     
## Rcnt_mntnSt  0.044  0.000  0.000  0.079              
## mentions_rt -0.078  0.000  0.000 -0.471 -0.556       
## blfTB:cnsIC  0.735 -0.992 -0.992  0.000  0.000  0.000
## convergence code: 0
## boundary (singular) fit: see ?isSingular
# Test the belief x consistency interaction (GPT-3): compare the full model
# against the additive model.
anova(model_full, model_just_fe)
## Data: df_fb_gpt3
## Models:
## model_just_fe: surprisal ~ belief + consistency + First_mention + Recent_mention + 
## model_just_fe:     mentions_ratio + (1 + belief * consistency | Item)
## model_full: surprisal ~ belief * consistency + First_mention + Recent_mention + 
## model_full:     mentions_ratio + (1 + belief * consistency | Item)
##               npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)    
## model_just_fe   17 4784.1 4882.5 -2375.1   4750.1                         
## model_full      18 4771.9 4875.9 -2367.9   4735.9 14.293  1  0.0001564 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test the fixed effect of belief (GPT-3): compare the additive model against
# the model without belief.
anova(model_just_fe, model_no_belief)
## Data: df_fb_gpt3
## Models:
## model_no_belief: surprisal ~ consistency + First_mention + Recent_mention + mentions_ratio + 
## model_no_belief:     (1 + belief * consistency | Item)
## model_just_fe: surprisal ~ belief + consistency + First_mention + Recent_mention + 
## model_just_fe:     mentions_ratio + (1 + belief * consistency | Item)
##                 npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)   
## model_no_belief   16 4791.2 4883.8 -2379.6   4759.2                        
## model_just_fe     17 4784.1 4882.5 -2375.1   4750.1 9.0841  1   0.002578 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test the fixed effect of consistency (GPT-3): compare the additive model
# against the model without consistency.
anova(model_just_fe, model_no_con)
## Data: df_fb_gpt3
## Models:
## model_no_con: surprisal ~ belief + First_mention + Recent_mention + mentions_ratio + 
## model_no_con:     (1 + belief * consistency | Item)
## model_just_fe: surprisal ~ belief + consistency + First_mention + Recent_mention + 
## model_just_fe:     mentions_ratio + (1 + belief * consistency | Item)
##               npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)  
## model_no_con    16 4785.2 4877.7 -2376.6   4753.2                       
## model_just_fe   17 4784.1 4882.5 -2375.1   4750.1 3.0206  1    0.08221 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Tidy the full GPT-3 model into a data frame of term-level estimates
# (overwrites any earlier `df_tidy_results`).
df_tidy_results <- broom.mixed::tidy(model_full)

# Coefficient plot of the fixed effects: point estimates with +/- 2
# standard-error bars, terms on the vertical axis, dotted line at zero.
ggplot(filter(df_tidy_results, effect == "fixed"),
       aes(x = term, y = estimate)) +
  geom_point() +
  coord_flip() +
  geom_hline(yintercept = 0, linetype = "dotted") +
  geom_errorbar(
    aes(ymin = estimate - 2 * std.error,
        ymax = estimate + 2 * std.error),
    width = 0.2,
    position = position_dodge(0.9)
  ) +
  theme_minimal()